####preparing, plotting and analyzing pupil size data

#1# extract trials with too many NAs
#2# build time windows
#3# build proportional values for each time window relative to baseline
#4# extract outliers
#5# plot data
#6# analyse data 
    #overall 
    #adults
    #children

library(lme4)
library(ggplot2)
library(psych)
library(lattice)
library(influence.ME)
library(plyr)
library(dplyr)
library(tidyr)
library(tibble)
library(readr)
library(anytime)
library(languageR)
library(stringr)
library(Rmisc)
library(psych)
library(lmerTest)
library(reshape)
library(ez)
library(fame)
options(scipen =999)




#################################################################################################################################################
### PREPARING ###################################################################################################################################
#################################################################################################################################################

setwd("")
d <- read.csv2("pupil_raw data.csv")




##############################
### CREATE BASELINE WINDOW ### 
##############################

#create baseline (= pupil size in the last 1000 ms before sentence start) per subject, item, and condition 
#filter out cases with equal/more than 75% NAs
d <- unite(d, sub_it_con, subject, item, condition, sep = "_", remove = FALSE)

#run this two rows to get a dataset with only the values of B1000 to B1900
#check for which subject_item_condition combination there are equal/more than 8 NAs
missing <- d[,c("sub_it_con", "B1000","B1100","B1200","B1300","B1400","B1500","B1600","B1700","B1800","B1900")]
missing <- missing[which(rowMeans(is.na(missing)) > .7), ] #filtert 4 oder mehr na raus

#extract those trials out of the data frame
d <- subset(d, sub_it_con != "K1L2_25_4")
d <- subset(d, sub_it_con != "K1L4_20_0")
d <- subset(d, sub_it_con != "K2L1_10_1")
d <- subset(d, sub_it_con != "K2L3_5_3")
d <- subset(d, sub_it_con != "K2L4_25_1")
d <- subset(d, sub_it_con != "K3L1_21_0")
d <- subset(d, sub_it_con != "K5L4_32_0")
d <- subset(d, sub_it_con != "K8L1_10_1")
d <- subset(d, sub_it_con != "K8L3_11_0")
d <- subset(d, sub_it_con != "K8L3_17_3")
d <- subset(d, sub_it_con != "K8L3_24_1")
d <- subset(d, sub_it_con != "K9L1_29_0")
d <- subset(d, sub_it_con != "K9L1_32_4")
d <- subset(d, sub_it_con != "P3L2_1_4")
d <- subset(d, sub_it_con != "P3L2_22_0")

#calculate baseline for the remaining trials (= mean of B1000 to B19000) 
d$b <- rowMeans(d[,c("B1000","B1100","B1200","B1300","B1400","B1500","B1600","B1700","B1800","B1900")], na.rm=TRUE, dims = 1) 




#################################
### CREATE OTHER TIME WINDOWS ### 
#################################

#other time windows = 600ms in lenght, non-overlapping, mean of pupil sizes in corresponding bins
d$s <- rowMeans(d[,c("S200", "S300","S400","S500","S600", "S700")], na.rm=TRUE, dims = 1) 
d$v <- rowMeans(d[,c("V200", "V300","V400","V500","V600", "V700")], na.rm=TRUE, dims = 1) 
d$g <- rowMeans(d[,c("G200", "G300","G400","G500","G600", "G700")], na.rm=TRUE, dims = 1) 
d$n <- rowMeans(d[,c("N200", "N300","N400","N500","N600", "N700")], na.rm=TRUE, dims = 1) 
d$p <- rowMeans(d[,c("P100", "P200","P300","P400","P500", "P600")], na.rm=TRUE, dims = 1) 


#run these 11 rows to find out for each window for which subject, item and condition combination there are equal/more than 5 NAs
missing_s <- d[,c("sub_it_con", "S200", "S300","S400","S500","S600", "S700")]
missing_s <- missing_s[which(rowMeans(is.na(missing_s)) > .6), ] 
missing_v <- d[,c("sub_it_con", "V200", "V300","V400","V500","V600", "V700")]
missing_v <- missing_v[which(rowMeans(is.na(missing_v)) > .6), ] 
missing_g <- d[,c("sub_it_con", "G200", "G300","G400","G500","G600", "G700")]
missing_g <- missing_g[which(rowMeans(is.na(missing_g)) > .6), ] 
missing_n <- d[,c("sub_it_con", "N200", "N300","N400","N500","N600", "N700")]
missing_n <- missing_n[which(rowMeans(is.na(missing_n)) > .6), ] 
missing_p <- d[,c("sub_it_con", "P100", "P200", "P300","P400","P500","P600")]
missing_p <- missing_p[which(rowMeans(is.na(missing_p)) > .6), ] 
miss <- bind_rows(missing_s ,missing_v, missing_g, missing_n, missing_p)

#filter out trials with equal/more than 5 NAs 
d <- subset(d, sub_it_con != "K1L2_8_3")
d <- subset(d, sub_it_con != "K1L4_24_0")
d <- subset(d, sub_it_con != "K1L4_25_1")
d <- subset(d, sub_it_con != "K1L4_3_4")
d <- subset(d, sub_it_con != "K1L4_6_3")
d <- subset(d, sub_it_con != "K2L4_17_1")
d <- subset(d, sub_it_con != "K3L1_10_1")
d <- subset(d, sub_it_con != "K3L1_14_1")
d <- subset(d, sub_it_con != "K3L1_16_4")
d <- subset(d, sub_it_con != "K3L1_19_3")
d <- subset(d, sub_it_con != "K3L1_23_3")
d <- subset(d, sub_it_con != "K3L1_24_4")
d <- subset(d, sub_it_con != "K3L1_30_1")
d <- subset(d, sub_it_con != "K3L1_32_4")
d <- subset(d, sub_it_con != "K3L1_4_4")
d <- subset(d, sub_it_con != "K3L1_5_0")
d <- subset(d, sub_it_con != "K3L1_7_3")
d <- subset(d, sub_it_con != "K3L1_9_0")
d <- subset(d, sub_it_con != "K3L2_11_1")
d <- subset(d, sub_it_con != "K3L2_31_1")
d <- subset(d, sub_it_con != "K4L2_8_3")
d <- subset(d, sub_it_con != "K4L3_11_0")
d <- subset(d, sub_it_con != "K4L3_13_3")
d <- subset(d, sub_it_con != "K4L3_20_1")
d <- subset(d, sub_it_con != "K4L3_23_0")
d <- subset(d, sub_it_con != "K4L3_3_0")
d <- subset(d, sub_it_con != "K4L3_30_4")
d <- subset(d, sub_it_con != "K4L3_31_0")
d <- subset(d, sub_it_con != "K4L3_32_1")
d <- subset(d, sub_it_con != "K4L4_21_1")
d <- subset(d, sub_it_con != "K5L1_27_3")
d <- subset(d, sub_it_con != "K5L3_2_4")
d <- subset(d, sub_it_con != "K5L3_21_3")
d <- subset(d, sub_it_con != "K5L3_23_0")
d <- subset(d, sub_it_con != "K5L3_26_4")
d <- subset(d, sub_it_con != "K5L3_8_1")
d <- subset(d, sub_it_con != "K5L4_3_4")
d <- subset(d, sub_it_con != "K6L4_1_1")
d <- subset(d, sub_it_con != "K6L4_10_3")
d <- subset(d, sub_it_con != "K6L4_11_4")
d <- subset(d, sub_it_con != "K6L4_15_4")
d <- subset(d, sub_it_con != "K6L4_17_1")
d <- subset(d, sub_it_con != "K6L4_21_1")
d <- subset(d, sub_it_con != "K6L4_25_1")
d <- subset(d, sub_it_con != "K6L4_26_3")
d <- subset(d, sub_it_con != "K7L1_14_1")
d <- subset(d, sub_it_con != "K7L1_3_3")
d <- subset(d, sub_it_con != "K7L4_22_3")
d <- subset(d, sub_it_con != "K7L4_29_1")
d <- subset(d, sub_it_con != "K8L1_8_4")
d <- subset(d, sub_it_con != "K8L3_12_1")
d <- subset(d, sub_it_con != "K8L3_14_4")
d <- subset(d, sub_it_con != "K8L3_16_1")
d <- subset(d, sub_it_con != "K8L3_3_0")
d <- subset(d, sub_it_con != "K8L3_5_3")
d <- subset(d, sub_it_con != "K8L3_6_4")
d <- subset(d, sub_it_con != "K8L3_9_3")
d <- subset(d, sub_it_con != "K8L4_15_4")
d <- subset(d, sub_it_con != "K9L1_17_0")
d <- subset(d, sub_it_con != "K9L1_31_3")
d <- subset(d, sub_it_con != "P10L4_21_1")
d <- subset(d, sub_it_con != "P2L4_18_3")
d <- subset(d, sub_it_con != "P3L2_13_4")
d <- subset(d, sub_it_con != "P3L2_27_1")
d <- subset(d, sub_it_con != "P3L2_8_3")
d <- subset(d, sub_it_con != "P3L2_9_4")
d <- subset(d, sub_it_con != "P4L1_15_3")
d <- subset(d, sub_it_con != "P4L1_18_1")
d <- subset(d, sub_it_con != "P4L1_25_0")
d <- subset(d, sub_it_con != "P4L1_3_3")
d <- subset(d, sub_it_con != "P5L2_23_1")
d <- subset(d, sub_it_con != "P5L2_8_3")
d <- subset(d, sub_it_con != "P6L4_17_1")
d <- subset(d, sub_it_con != "P7L2_17_4")
d <- subset(d, sub_it_con != "P7L3_31_0")
d <- subset(d, sub_it_con != "P9L1_2_1")




##############################################
### RELATIVIZE TIME WINDOWS USING BASELINE ### 
##############################################

d <- mutate(d, ss = (s/b))
d <- mutate(d, vv = (v/b))
d <- mutate(d, gg = (g/b))
d <- mutate(d, nn = (n/b))
d <- mutate(d, pp = (p/b))

d <- mutate(d, sss = ifelse(ss > 1, ss-1, -(1-ss)))
d <- mutate(d, vvv = ifelse(vv > 1, vv-1, -(1-vv)))
d <- mutate(d, ggg = ifelse(gg > 1, gg-1, -(1-gg)))
d <- mutate(d, nnn = ifelse(nn > 1, nn-1, -(1-nn)))
d <- mutate(d, ppp = ifelse(pp > 1, pp-1, -(1-pp)))

d <- mutate(d, sss = sss*100)
d <- mutate(d, vvv = vvv*100)
d <- mutate(d, ggg = ggg*100)
d <- mutate(d, nnn = nnn*100)
d <- mutate(d, ppp = ppp*100)

d$s <- d$sss
d$v <- d$vvv
d$g <- d$ggg
d$n <- d$nnn
d$p <- d$ppp




###########################################
### OUTLIER REMOVAL WITH 1.5*IQR METHOD ### (per group and condition)
########################################### 

d <- unite(d, gro_con, group, condition, sep = "_", remove = FALSE)

###SUBJECT WINDOW 
d <- d %>% 
  group_by(gro_con)%>% 
  mutate(smean = mean(s, na.rm = TRUE))

d <- d %>% 
  group_by(gro_con)%>% 
  mutate(sIQR = IQR(s, na.rm = TRUE))

d <- d %>% 
  group_by(gro_con)%>% 
  mutate(sQ1 = quantile(s, probs = .25, na.rm = T))

d <- d %>% 
  group_by(gro_con)%>% 
  mutate(sQ3 = quantile(s, probs = .75, na.rm = T))

d <- d %>% 
  group_by(gro_con)%>% 
  mutate(s_clean = ifelse(s < sQ1 - 1.5 * sIQR | s > sQ3 + 1.5 * sIQR, smean, s))


###VERB WINDOW 
d <- d %>% 
  group_by(gro_con)%>% 
  mutate(vmean = mean(v, na.rm = TRUE))

d <- d %>% 
  group_by(gro_con)%>% 
  mutate(vIQR = IQR(v, na.rm = TRUE))

d <- d %>% 
  group_by(gro_con)%>% 
  mutate(vQ1 = quantile(v, probs = .25, na.rm = T))

d <- d %>% 
  group_by(gro_con)%>% 
  mutate(vQ3 = quantile(v, probs = .75, na.rm = T))

d <- d %>% 
  group_by(gro_con)%>% 
  mutate(v_clean = ifelse(v < vQ1 - 1.5 * vIQR | v > vQ3 + 1.5 * vIQR, vmean, v))


###GLEICH WINDOW 
d <- d %>% 
  group_by(gro_con)%>% 
  mutate(gmean = mean(g, na.rm = TRUE))

d <- d %>% 
  group_by(gro_con)%>% 
  mutate(gIQR = IQR(g, na.rm = TRUE))

d <- d %>% 
  group_by(gro_con)%>% 
  mutate(gQ1 = quantile(g, probs = .25, na.rm = T))

d <- d %>% 
  group_by(gro_con)%>% 
  mutate(gQ3 = quantile(g, probs = .75, na.rm = T))

d <- d %>% 
  group_by(gro_con)%>% 
  mutate(g_clean = ifelse(g < gQ1 - 1.5 * gIQR | g > gQ3 + 1.5 * gIQR, gmean , g))


###NOUN WINDOW 
d <- d %>% 
  group_by(gro_con)%>% 
  mutate(nmean = mean(n, na.rm = TRUE))

d <- d %>% 
  group_by(gro_con)%>% 
  mutate(nIQR = IQR(n, na.rm = TRUE))

d <- d %>% 
  group_by(gro_con)%>% 
  mutate(nQ1 = quantile(n, probs = .25, na.rm = T))

d <- d %>% 
  group_by(gro_con)%>% 
  mutate(nQ3 = quantile(n, probs = .75, na.rm = T))

d <- d %>% 
  group_by(gro_con)%>% 
  mutate(n_clean = ifelse(n < nQ1 - 1.5 * nIQR | n > nQ3 + 1.5 * nIQR, nmean , n))


###POST WINDOW 
d <- d %>% 
  group_by(gro_con)%>% 
  mutate(pmean = mean(p, na.rm = TRUE))

d <- d %>% 
  group_by(gro_con)%>% 
  mutate(pIQR = IQR(p, na.rm = TRUE))

d <- d %>% 
  group_by(gro_con)%>% 
  mutate(pQ1 = quantile(p, probs = .25, na.rm = T))

d <- d %>% 
  group_by(gro_con)%>% 
  mutate(pQ3 = quantile(p, probs = .75, na.rm = T))

d <- d %>% 
  group_by(gro_con)%>% 
  mutate(p_clean = ifelse(p < pQ1 - 1.5 * pIQR | p > pQ3 + 1.5 * pIQR, pmean, p))




#################################
### BRING DATA IN LONG FORMAT ###
#################################

p <- d[,c("subject", "group", "item", "condition", "s_clean", "v_clean", "g_clean", "n_clean", "p_clean"
          , "ppvt", "flu", "csst", "accu", "age")] 

p$w01 <- p$s_clean
p$w02 <- p$v_clean
p$w03 <- p$g_clean
p$w04 <- p$n_clean
p$w05 <- p$p_clean

p <- gather(p, window, pupil, w01:w05)
p <- p[,c("subject", "group", "item", "condition", "window", "pupil", "ppvt", "flu", "csst", "accu", "age")]

p$window <- as.factor(p$window)
p$condition <- as.factor(p$condition)
p$pupil <- as.numeric(p$pupil)
p <- rename(p, replace = c("condition" = "Condition"))
p <- mutate(p, group = ifelse(group == "K", "Children", "Adults"))
p <- mutate(p, window = ifelse(p$window == "w01", "1", 
                        ifelse(p$window == "w02", "2", 
                        ifelse(p$window == "w03", "3",
                        ifelse(p$window == "w04", "4",
                        ifelse(p$window == "w05", "5",NA))))))

write.csv(p, "pupil_ready2analyse.csv")




############
### PLOT ###
############

plot <- summarySE(p, measurevar="pupil", groupvars=c("Condition", "window", "group"), na.rm=T)
ggplot(plot, aes(x=window, y=pupil, group=Condition, color=Condition, fill = Condition))+
  geom_line(stat = "identity")+
  geom_errorbar(position=position_dodge(.0),width=.2, aes(ymin=pupil-se, ymax=pupil+se))+
  facet_grid(~group) +
  geom_point(size=2) +
  geom_line(size =0.5) +
  theme_bw()+ 
  theme(strip.background = element_rect(color="black", fill="white", size=0.5, linetype="solid"))+ #kästchen beschriftung groups
  theme(strip.text.x = element_text(size = 12, color = "black"))+ #beschriftung groups
  ggtitle("Pupil Size Change")+
  theme(plot.title = element_text(face = "bold", size = 14))+
  xlab("Window") +
  ylab("Mean pupil size changes")+
  theme(aspect.ratio = 2/1)+ #stauchung/streckung
  theme(axis.title.x = element_text(size = 12))+
  theme(axis.title.y = element_text(size = 12))+
  theme(axis.text.x = element_text(size =10))+
  theme(axis.text.y = element_text(size =10))+
  theme(axis.title.x = element_text(vjust = -2))+ 
  theme(axis.title.y = element_text(vjust = 3))+ 
  theme(axis.text.x = element_text(angle = 90, vjust = 0.3))+
  scale_x_discrete(labels=c("1" = "Subject", "2" = "Verb","3" = "Spill-over", "4" = "Noun", "5" = "Postview"))+ #renaming x achsis 
  scale_y_continuous(labels = function(x) paste0(x*1, "%"))+ #% value to y lab
  scale_color_manual(values=c("darkred", "darkorange", "blue", "darkolivegreen"), labels=c("0", "1", "3", "4")) 

#ggsave("pupil_plot.png")




#################################################################################################################################################
### ANAlYSES ####################################################################################################################################
#################################################################################################################################################

d <- read.csv("pupil_ready2analyse.csv")
d <- rename(d, replace = c("Condition" = "condition"))




##############################
### DESCREPTIVE STATISTICS ###
##############################

d <- unite(d, gro_con_win, group, condition, window, sep = "_", remove = FALSE)
summarize(group_by(d, gro_con_win), m=mean(pupil, na.rm = TRUE ), sd=sd(pupil, na.rm = TRUE))

d <- unite(d, con_win, condition, window, sep = "_", remove = FALSE)
summarize(group_by(d, con_win), m=mean(pupil, na.rm = TRUE ), sd=sd(pupil, na.rm = TRUE))




#####################
### EFFECT CODING ###
#####################

d <- mutate(d,v1 = ifelse(condition == "0", 3/4, -1/4),                                                            #0 vs. 1, 3 & 4
            v2 = ifelse(condition == "0", 0, ifelse(condition == "1", 2/3, -1/3)),                                 #1 vs. 3 & 4
            v3 = ifelse(condition == "0", 0, ifelse(condition == "1", 0, ifelse(condition == "3", 1/2, -1/2))))    #3 vs. 4

d <- mutate(d, a = ifelse(group == "Adults", 1/2, -1/2))   #adults vs. children




#############################################
### LINEAR MIXED EFFECTS MODELS - OVERALL ### (converged version; per time window; with the effect coded factors condition and age group)
#############################################

ws <- subset(d, window == "1")
wv <- subset(d, window == "2")
wg <- subset(d, window == "3")
wn <- subset(d, window == "4")
wp <- subset(d, window == "5")


###SUBJECT 
ms <- lmer(pupil ~ (v1+v2+v3) * a +
             (1 + (v1+v2) || subject)+
             (1 + (v1+v3) || item),
           data=ws) 
summary(ms)
confint(ms)


###VERB 
mv <- lmer(pupil ~ (v1+v2+v3) * a +
             (1 + (v2+v3) || subject)+
             (1 + (v1+v2) || item),
           data=wv) 
summary(mv)
confint(mv)


###GLEICH 
mg <- lmer(pupil ~ (v1+v2+v3) * a +
             (1 + (v1+v2) || subject)+
             (1 + (v1)  || item),
           data=wg) 
summary(mg)
confint(mg)


###NOUN 
mn <- lmer(pupil ~ (v1+v2+v3) * a +
             (1 + (v1+v2+v3) || subject)+
             (1 + (v1+v2)*a  || item),
           data=wn) 
summary(mn)
confint(mn)


###POST 
mp <- lmer(pupil ~ (v1+v2+v3) * a +
             (1 + (v1) || subject)+
             (1 + (v2) || item),
           data=wp) 
summary(mp)
confint(mp)




###################
### ADULTS ONLY ###
###################

adu <- subset(d, group == "Adults")

adu_ws <- subset(adu, window == "1")
adu_wv <- subset(adu, window == "2")
adu_wg <- subset(adu, window == "3")
adu_wn <- subset(adu, window == "4")
adu_wp <- subset(adu, window == "5") 


###SUBJECT  
adu_ms <- lmer(pupil ~ (v1+v2+v3)  +
                 (1 +(v1+v2+v3) || subject)
               ,
               data=adu_ws) 
summary(adu_ms)
confint(adu_ms)


###VERB 
adu_mv <- lmer(pupil ~ (v1+v2+v3)  +
                 (1 +(v2) || subject),
               data=adu_wv) 
summary(adu_mv)
confint(adu_mv)


###GLEICH 
adu_mg <- lmer(pupil ~ (v1+v2+v3)  +
                 (1 +(v2) || subject)+
                 (1 +(v1+v2) || item),
               data=adu_wg) 
summary(adu_mg)
confint(adu_mg)


###NOUN 
adu_mn <- lmer(pupil ~ (v1+v2+v3)  +
                 (1 +(v1+v2+v3) || subject)+
                 (1 +(v1+v2) || item),
               data=adu_wn) 
summary(adu_mn)
confint(adu_mn)


###POST 
adu_mp <- lmer(pupil ~ (v1+v2+v3)  +
                 (1 +(v1+v3) || subject)+
                 (1 +(v1+v2) || item),
               data=adu_wp) 
summary(adu_mp)
confint(adu_mp)




#######################
### CHILDREN - ONLY ###
#######################

kid <- subset(d, group == "Children")

kid_ws <- subset(kid, window == "1")
kid_wv <- subset(kid, window == "2")
kid_wg <- subset(kid, window == "3")
kid_wn <- subset(kid, window == "4")
kid_wp <- subset(kid, window == "5") 


###SUBJECT 
kid_ms <- lmer(pupil ~ (v1+v2+v3)  +
                 (1 +(v2) || subject)+
                 (1 +(v1+v2) || item),
               data=kid_ws) 
summary(kid_ms)
confint(kid_ms)


###VERB 
kid_mv <- lmer(pupil ~ (v1+v2+v3)  +
                 (1 +(v1+v2+v3) || subject)+
                 (1 +(v1+v2+v3) || item),
               data=kid_wv) 
summary(kid_mv)
confint(kid_mv)


###GLEICH 
kid_mg <- lmer(pupil ~ (v1+v2+v3)  +
                 (1 +(v1+v2) || subject)+
                 (1 +(v2+v3) || item),
               data=kid_wg) 
summary(kid_mg)
confint(kid_mg)


###NOUN 
kid_mn <- lmer(pupil ~ (v1+v2+v3)  +
                 (1 +(v1+v2+v3) || subject)+
                 (1 +(v2+v3) || item),
               data=kid_wn) 
summary(kid_mn)
confint(kid_mn)


###POST 
kid_mp <- lmer(pupil ~ (v1+v2+v3)   +
                 (1 +(v2+v3) || subject)+
                 (1 +(v3) || item),
               data=kid_wp) 
summary(kid_mp)
confint(kid_mp)